Thesis Data Work

Author

Aaron DiLorenzo

Published

February 15, 2026

1 python pqi script

Show code
"""
OECD Patent Quality Indicators
normalization, winsorization, and aggregation procedures
"""

import polars as pl
import os

OUTPUT_DIR = "/Users/aarondilorenzo/Desktop/DATATHES/FINAL WORKING DATA"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# for reproducing, change the file paths to correctly the downloaded data, 
# they're too large for me to make this self running
INDICATORS_FILE = "Data/202502_OECD_PATENT_QUALITY_EPO_INDIC.txt"
COHORTS_FILE = "Data/202502_OECD_PATENT_QUALITY_EPO_INDIC_COHORT.txt"
HAN_PATENTS_FILE = "Data/202502_HAN_PATENTS.txt"
HARM_NAMES_FILE = "Data/DATATHES/202502_HARM_NAMES.txt"

FINAL_PATENT_FILE = "OECD_patent_quality_final.parquet"
FINAL_COUNTRY_FILE = "OECD_country_year_final.csv"

WINSOR_VARS = ["family_size", "grant_lag", "bwd_cits", "npl_cits", "claims", "fwd_cits5", "fwd_cits7"]
COMPOSITE_VARS = ["generality", "originality", "radicalness", "quality_index_4", "quality_index_6"]

# load the data

def load_data():

    cohorts = pl.read_csv(COHORTS_FILE, separator="|", infer_schema_length=0)
    cohorts = cohorts.rename({c: c.strip() for c in cohorts.columns})

    indicators = pl.read_csv(INDICATORS_FILE, separator="|", infer_schema_length=10000)
    indicators = indicators.rename({c: c.strip() for c in indicators.columns})

    han = pl.read_csv(HAN_PATENTS_FILE, separator="|")
    han = han.rename({c: c.strip() for c in han.columns})
    han = han.rename({"Appln_id": "appln_id"})

    harm = pl.read_csv(HARM_NAMES_FILE, separator="|")
    harm = harm.rename({c: c.strip() for c in harm.columns})
    harm = harm.select(["HARM_ID", "Person_ctry_code"])

    han = han.join(harm, on="HARM_ID", how="left")

    return cohorts, indicators, han

# main function

def main():

    cohorts, indicators, han = load_data()

    merged = indicators.join(han, on="appln_id", how="left")

    # ensure breakthrough is binary
    if "breakthrough" in merged.columns:
        merged = merged.with_columns(
            pl.col("breakthrough").fill_null(0).cast(pl.Int8)
        )

    # aggregation to country/year

    agg_exprs = []

    # normalized indicators
    norm_cols = [c for c in merged.columns if c.endswith("_norm")]
    for c in norm_cols:
        agg_exprs.append(pl.mean(c).alias(f"avg_{c}"))

    # mean for composite indicator
    for c in COMPOSITE_VARS:
        if c in merged.columns:
            agg_exprs.append(pl.mean(c).alias(f"avg_{c}"))

    # breakthrough counts
    agg_exprs.extend([
        pl.sum("breakthrough").alias("breakthrough_count"),
        pl.count("appln_id").alias("patent_count"),
        (pl.sum("breakthrough") / pl.count("appln_id")).alias("pct_breakthrough")
    ])

    country_agg = (
        merged
        .filter(pl.col("Person_ctry_code").is_not_null())
        .group_by(["Person_ctry_code", "filing"])
        .agg(agg_exprs)
        .rename({"Person_ctry_code": "iso3c", "filing": "year"})
        .sort(["iso3c", "year"])
    )

    country_agg.write_csv(
        os.path.join(OUTPUT_DIR, FINAL_COUNTRY_FILE)
    )

    merged.write_parquet(
        os.path.join(OUTPUT_DIR, FINAL_PATENT_FILE)
    )

if __name__ == "__main__":
    main()

2 data construction

Show code
# data construction script
# pivoting datasets, interpolating the Park index, and adding iso3c keys not included in this script
rm(list = ls())

library(data.table)
library(dplyr)

options(dplyr.summarise.inform = FALSE)

# load individual datasets
pqi      <- fread("data/OECD_country_year_final.csv")
fd       <- fread("data/FD_Final.csv")
vc       <- fread("data/VC_final.csv")
counts   <- fread("data/counts_res_final.csv")
controls <- fread("data/Controls_final.csv")
park     <- fread("data/Park_final.csv")

# ------------------------------------------------------------
# CLEAN MISSING VALUES
# ------------------------------------------------------------

clean_missing <- function(df) {
  df[df == ".."] <- NA
  df[df == "NA"] <- NA
  df[df == ""]   <- NA
  return(df)
}

pqi      <- clean_missing(pqi)
fd       <- clean_missing(fd)
vc       <- clean_missing(vc)
counts   <- clean_missing(counts)
controls <- clean_missing(controls)
park     <- clean_missing(park)

# ------------------------------------------------------------
# REMOVE DUPLICATE RESEARCHER COLUMN FROM COUNTS
# ------------------------------------------------------------

counts <- counts %>%
  select(-researchers_per_million)

# ------------------------------------------------------------
# MERGE DATASETS STEP BY STEP
# ------------------------------------------------------------

df <- pqi %>%
  left_join(park %>% select(iso3c, year, ipp_index_IP_transformed),
            by = c("iso3c", "year")) %>%
  left_join(fd,       by = c("iso3c", "year")) %>%
  left_join(vc,       by = c("iso3c", "year")) %>%
  left_join(counts,   by = c("iso3c", "year")) %>%
  left_join(controls, by = c("iso3c", "year"))

# ------------------------------------------------------------
# FORCE NUMERIC VARIABLES
# ------------------------------------------------------------

numeric_vars <- c(
  "researchers_per_million",
  "population_total",
  "gdp_pc_const2015usd",
  "bank_credit_gdp",
  "market_cap_gdp",
  "VC_investment",
  "trade_percent_gdp",
  "rd_expenditure_percent_gdp",
  "tertiary_enrollment_percent",
  "patent_applications_residents",
  "breakthrough_count",
  "patent_count",
  "ipp_index_IP_transformed"
)

for (v in numeric_vars) {
  if (v %in% names(df)) {
    df[[v]] <- as.numeric(df[[v]])
  }
}

# ------------------------------------------------------------
# DERIVED VARIABLES
# ------------------------------------------------------------

df <- df %>%
  mutate(
    # Researchers total
    researchers_total =
      researchers_per_million * (population_total / 1e6),
    
    # Patents per 1000 researchers
    patents_per_1000_researchers =
      (patent_applications_residents / researchers_total) * 1000,
    
    patents_per_1000_researchers =
      ifelse(
        is.infinite(patents_per_1000_researchers) |
        patents_per_1000_researchers < 0,
        NA,
        patents_per_1000_researchers
      ),
    
    # Scale quality index
    quality_index_100 = avg_quality_index_4 * 100,
    
    # VC scaling
    VC_investment_pct = VC_investment * 100,
    
    # Log transforms
    ln_patents   = log(patents_per_1000_researchers + 1),
    ln_gdp_pc    = log(gdp_pc_const2015usd + 1),
    ln_bank      = log(bank_credit_gdp + 1),
    ln_market    = log(market_cap_gdp + 1),
    ln_vc        = log(VC_investment_pct + 1),
    ln_trade     = log(trade_percent_gdp + 1),
    ln_rd        = log(rd_expenditure_percent_gdp + 1),
    ln_tertiary  = log(tertiary_enrollment_percent + 1),
    
    # Mean-center IPR
    ipr_c =
      ipp_index_IP_transformed -
      mean(ipp_index_IP_transformed, na.rm = TRUE),
    
    # Interaction terms
    ln_bank_x_ipr   = ln_bank   * ipr_c,
    ln_market_x_ipr = ln_market * ipr_c,
    ln_vc_x_ipr     = ln_vc     * ipr_c
  )

# ------------------------------------------------------------
# SAVE FINAL PANEL
# ------------------------------------------------------------

fwrite(df, "data/monarch_panel_transformed.csv")

summary(df)
     iso2                year      avg_generality   avg_originality 
 Length:2783        Min.   :1978   Min.   :0.0000   Min.   :0.0000  
 Class :character   1st Qu.:1991   1st Qu.:0.2917   1st Qu.:0.6132  
 Mode  :character   Median :2003   Median :0.3499   Median :0.6828  
                    Mean   :2002   Mean   :0.3480   Mean   :0.6628  
                    3rd Qu.:2013   3rd Qu.:0.4034   3rd Qu.:0.7231  
                    Max.   :2024   Max.   :0.8786   Max.   :0.9517  
                                   NA's   :340      NA's   :2       
 avg_radicalness  avg_quality_index_4 avg_quality_index_6 breakthrough_count
 Min.   :0.0000   Min.   :0.04289     Min.   :0.08356     Min.   :  0.000   
 1st Qu.:0.2443   1st Qu.:0.22594     1st Qu.:0.28682     1st Qu.:  0.000   
 Median :0.3042   Median :0.26274     Median :0.31355     Median :  0.000   
 Mean   :0.3035   Mean   :0.26228     Mean   :0.31233     Mean   :  4.851   
 3rd Qu.:0.3611   3rd Qu.:0.29348     3rd Qu.:0.33399     3rd Qu.:  1.000   
 Max.   :1.0000   Max.   :0.68455     Max.   :0.58308     Max.   :264.000   
                  NA's   :205         NA's   :205                           
  patent_count     pct_breakthrough      iso3c          
 Min.   :    1.0   Min.   :0.000000   Length:2783       
 1st Qu.:   11.0   1st Qu.:0.000000   Class :character  
 Median :   66.0   Median :0.000000   Mode  :character  
 Mean   : 1619.9   Mean   :0.002478                     
 3rd Qu.:  588.5   3rd Qu.:0.001188                     
 Max.   :44465.0   Max.   :0.500000                     
                                                        
 ipp_index_IP_transformed Country Name.x     CM.MKT.LCAP.GD.ZS 
 Min.   :0.200            Length:2783        Length:2783       
 1st Qu.:2.892            Class :character   Class :character  
 Median :3.742            Mode  :character   Mode  :character  
 Mean   :3.443                                                 
 3rd Qu.:4.333                                                 
 Max.   :5.000                                                 
 NA's   :842                                                   
     Code           bank_credit_gdp   FS.AST.PRVT.GD.ZS  market_cap_gdp     
 Length:2783        Min.   :  6.828   Length:2783        Min.   :   0.0231  
 Class :character   1st Qu.: 43.284   Class :character   1st Qu.:  22.0505  
 Mode  :character   Median : 71.454   Mode  :character   Median :  43.3967  
                    Mean   : 82.128                      Mean   :  74.5402  
                    3rd Qu.:113.691                      3rd Qu.:  86.1512  
                    Max.   :304.575                      Max.   :1777.2263  
                    NA's   :1038                         NA's   :1042       
   country          VC_investment    Country Name.y    
 Length:2783        Min.   :0.0000   Length:2783       
 Class :character   1st Qu.:0.0132   Class :character  
 Mode  :character   Median :0.0298   Mode  :character  
                    Mean   :0.0862                     
                    3rd Qu.:0.0651                     
                    Max.   :4.7556                     
                    NA's   :2141                       
 patent_applications_residents Country Name       rd_expenditure_percent_gdp
 Min.   :      1.0             Length:2783        Min.   :0.0423            
 1st Qu.:    240.8             Class :character   1st Qu.:0.6386            
 Median :   1112.0             Mode  :character   Median :1.1858            
 Mean   :  20985.8                                Mean   :1.4526            
 3rd Qu.:   3062.8                                3rd Qu.:2.1022            
 Max.   :1426644.0                                Max.   :6.0192            
 NA's   :803                                      NA's   :1447              
 trade_percent_gdp gdp_pc_const2015usd tertiary_enrollment_percent
 Min.   :  9.10    Min.   :   394.2    Min.   :  1.556            
 1st Qu.: 50.00    1st Qu.: 10358.2    1st Qu.: 37.808            
 Median : 70.13    Median : 22716.6    Median : 59.995            
 Mean   : 92.48    Mean   : 27724.5    Mean   : 57.467            
 3rd Qu.:113.90    3rd Qu.: 38003.8    3rd Qu.: 75.660            
 Max.   :442.62    Max.   :167187.2    Max.   :166.666            
 NA's   :434       NA's   :344         NA's   :1441               
 researchers_per_million population_total    researchers_total  
 Min.   :   38.11        Min.   :1.113e+04   Min.   :     12.1  
 1st Qu.: 1242.48        1st Qu.:3.457e+06   1st Qu.:   9771.8  
 Median : 2681.98        Median :9.967e+06   Median :  31718.9  
 Mean   : 2931.88        Mean   :7.030e+07   Mean   : 132168.0  
 3rd Qu.: 4310.71        3rd Qu.:4.428e+07   3rd Qu.:  95538.1  
 Max.   :20047.05        Max.   :1.451e+09   Max.   :2611445.8  
 NA's   :1578            NA's   :157         NA's   :1578       
 patents_per_1000_researchers quality_index_100 VC_investment_pct
 Min.   :  0.4355             Min.   : 4.289    Min.   :  0.000  
 1st Qu.: 19.7081             1st Qu.:22.594    1st Qu.:  1.320  
 Median : 35.8039             Median :26.274    Median :  2.978  
 Mean   : 70.9319             Mean   :26.228    Mean   :  8.622  
 3rd Qu.: 58.9592             3rd Qu.:29.348    3rd Qu.:  6.508  
 Max.   :754.1795             Max.   :68.455    Max.   :475.559  
 NA's   :1687                 NA's   :205       NA's   :2141     
   ln_patents       ln_gdp_pc         ln_bank        ln_market     
 Min.   :0.3615   Min.   : 5.979   Min.   :2.058   Min.   :0.0229  
 1st Qu.:3.0305   1st Qu.: 9.246   1st Qu.:3.791   1st Qu.:3.1377  
 Median :3.6056   Median :10.031   Median :4.283   Median :3.7932  
 Mean   :3.6313   Mean   : 9.861   Mean   :4.221   Mean   :3.7583  
 3rd Qu.:4.0937   3rd Qu.:10.546   3rd Qu.:4.742   3rd Qu.:4.4676  
 Max.   :6.6270   Max.   :12.027   Max.   :5.722   Max.   :7.4834  
 NA's   :1687     NA's   :344      NA's   :1038    NA's   :1042    
     ln_vc           ln_trade         ln_rd         ln_tertiary    
 Min.   :0.0000   Min.   :2.312   Min.   :0.0414   Min.   :0.9384  
 1st Qu.:0.8414   1st Qu.:3.932   1st Qu.:0.4939   1st Qu.:3.6586  
 Median :1.3808   Median :4.264   Median :0.7820   Median :4.1108  
 Mean   :1.5160   Mean   :4.321   Mean   :0.8159   Mean   :3.9219  
 3rd Qu.:2.0160   3rd Qu.:4.744   3rd Qu.:1.1321   3rd Qu.:4.3394  
 Max.   :6.1666   Max.   :6.095   Max.   :1.9487   Max.   :5.1220  
 NA's   :2141     NA's   :434     NA's   :1447     NA's   :1441    
     ipr_c         ln_bank_x_ipr      ln_market_x_ipr   ln_vc_x_ipr    
 Min.   :-3.2426   Min.   :-10.6344   Min.   :-8.875   Min.   :0.0000  
 1st Qu.:-0.5509   1st Qu.: -0.8462   1st Qu.:-0.495   1st Qu.:0.4177  
 Median : 0.2991   Median :  1.8150   Median : 1.488   Median :1.2777  
 Mean   : 0.0000   Mean   :  1.1773   Mean   : 1.270   Mean   :1.3190  
 3rd Qu.: 0.8908   3rd Qu.:  4.1975   3rd Qu.: 3.819   3rd Qu.:1.8626  
 Max.   : 1.5574   Max.   :  7.7365   Max.   : 7.387   Max.   :5.8230  
 NA's   :842       NA's   :1418       NA's   :1324     NA's   :2355    

3 baseline twfe

Show code
# baseline twfe models

rm(list = ls())

library(data.table)
library(dplyr)
library(plm)
library(lmtest)
library(sandwich)
library(stargazer)

options(dplyr.summarise.inform = FALSE)

# load data

df <- fread("data/monarch_panel_transformed.csv")


if (!dir.exists("tables")) {
  dir.create("tables")
}

# clustered se

cluster_se <- function(model) {
  sqrt(diag(vcovHC(model, type = "HC1", cluster = "group")))
}

# function

run_twfe_incremental <- function(depvar, fd, fd_x_ipr, outfile, title) {

# maximize samples by finance type

  base <- df %>%
    filter(
      !is.na(.data[[depvar]]),
      !is.na(.data[[fd]]),
      !is.na(ipr_c)
    )

  panel <- pdata.frame(base, index = c("iso3c", "year"))
# specifications

  f1 <- as.formula(paste(depvar, "~", fd, "+ ipr_c +", fd_x_ipr))
  f2 <- update(f1, . ~ . + ln_gdp_pc)
  f3 <- update(f2, . ~ . + ln_trade)
  f4 <- update(f3, . ~ . + ln_rd)
  f5 <- update(f4, . ~ . + ln_tertiary)

  m1 <- plm(f1, data = panel, model = "within", effect = "twoways")
  m2 <- plm(f2, data = panel, model = "within", effect = "twoways")
  m3 <- plm(f3, data = panel, model = "within", effect = "twoways")
  m4 <- plm(f4, data = panel, model = "within", effect = "twoways")
  m5 <- plm(f5, data = panel, model = "within", effect = "twoways")
  
  html_table <- capture.output(
  stargazer(
    m1, m2, m3, m4, m5,
    type = "html",
    title = title,
    dep.var.caption = ifelse(
      depvar == "quality_index_100",
      "Dependent Variable: Patent Quality (0–100)",
      "Dependent Variable: ln(Patents per 1,000 Researchers)"
    ),
    se = list(
      cluster_se(m1),
      cluster_se(m2),
      cluster_se(m3),
      cluster_se(m4),
      cluster_se(m5)
    ),
    omit.stat = c("ser", "f"),
    notes = c(
      "Country and year fixed effects included in all models.",
      "Standard errors clustered by country.",
      "* p<0.1; ** p<0.05; *** p<0.01"
    ),
    no.space = TRUE
  )
)

cat(paste(html_table, collapse = "\n"))


# latex tables
  stargazer(
    m1, m2, m3, m4, m5,
    type = "latex",
    out  = outfile,
    title = title,
    dep.var.caption = ifelse(
      depvar == "quality_index_100",
      "Dependent Variable: Patent Quality (0–100)",
      "Dependent Variable: ln(Patents per 1,000 Researchers)"
    ),
    se = list(
      cluster_se(m1),
      cluster_se(m2),
      cluster_se(m3),
      cluster_se(m4),
      cluster_se(m5)
    ),
    omit.stat = c("ser", "f"),
    notes = c(
      "Country and year fixed effects included in all models.",
      "Standard errors clustered by country.",
      "* p<0.1; ** p<0.05; *** p<0.01"
    ),
    no.space = TRUE
  )

  return(list(m1, m2, m3, m4, m5))
}

# quality models

market_quality <- run_twfe_incremental(
  depvar   = "quality_index_100",
  fd       = "ln_market",
  fd_x_ipr = "ln_market_x_ipr",
  outfile  = "tables/twfe_market_quality.tex",
  title    = "Market Finance and Patent Quality"
)
Market Finance and Patent Quality
Dependent Variable: Patent Quality (0–100)
quality_index_100
(1) (2) (3) (4) (5)
ln_market 0.390 0.364 0.390 0.975** 0.602
(0.281) (0.300) (0.316) (0.396) (0.632)
ipr_c 1.751* 2.036** 2.068** 4.285** 3.859***
(0.907) (0.937) (0.952) (1.946) (1.214)
ln_market_x_ipr -0.651*** -0.676*** -0.685*** -1.168*** -1.053**
(0.229) (0.232) (0.232) (0.394) (0.430)
ln_gdp_pc -0.788 -0.837 -2.337* -0.819
(1.821) (1.856) (1.386) (1.678)
ln_trade -0.443 -0.515 -1.299
(1.273) (1.834) (1.900)
ln_rd -0.432 -1.372
(1.615) (2.344)
ln_tertiary -1.244*
(0.754)
Observations 1,431 1,422 1,422 880 636
R2 0.020 0.023 0.023 0.025 0.015
Adjusted R2 -0.048 -0.045 -0.046 -0.069 -0.121
Note: p<0.1; p<0.05; p<0.01
Country and year fixed effects included in all models.
Standard errors clustered by country.
  • p<0.1; ** p<0.05; *** p<0.01
% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com % Date and time: Sun, Feb 15, 2026 - 12:33:17
Show code
bank_quality <- run_twfe_incremental(
  depvar   = "quality_index_100",
  fd       = "ln_bank",
  fd_x_ipr = "ln_bank_x_ipr",
  outfile  = "tables/twfe_bank_quality.tex",
  title    = "Bank Finance and Patent Quality"
)
Bank Finance and Patent Quality
Dependent Variable: Patent Quality (0–100)
quality_index_100
(1) (2) (3) (4) (5)
ln_bank -1.162* -0.989 -0.910 0.213 -0.397
(0.705) (0.752) (0.710) (0.766) (0.884)
ipr_c -1.297 -1.065 -1.907 5.569** 3.667
(2.874) (2.993) (3.376) (2.283) (2.758)
ln_bank_x_ipr 0.481 0.446 0.648 -1.236** -0.722
(0.888) (0.834) (0.915) (0.538) (0.698)
ln_gdp_pc -0.693 -0.988 -1.668 -1.655
(2.532) (2.356) (1.089) (1.021)
ln_trade 3.016 -0.447 -1.335
(1.946) (1.236) (1.526)
ln_rd 1.487 2.405
(1.412) (1.894)
ln_tertiary -0.594
(0.889)
Observations 1,329 1,308 1,302 920 723
R2 0.010 0.008 0.019 0.017 0.012
Adjusted R2 -0.067 -0.069 -0.060 -0.075 -0.111
Note: p<0.1; p<0.05; p<0.01
Country and year fixed effects included in all models.
Standard errors clustered by country.
  • p<0.1; ** p<0.05; *** p<0.01
% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com % Date and time: Sun, Feb 15, 2026 - 12:33:19
Show code
vc_quality <- run_twfe_incremental(
  depvar   = "quality_index_100",
  fd       = "ln_vc",
  fd_x_ipr = "ln_vc_x_ipr",
  outfile  = "tables/twfe_vc_quality.tex",
  title    = "Venture Capital and Patent Quality"
)
Venture Capital and Patent Quality
Dependent Variable: Patent Quality (0–100)
quality_index_100
(1) (2) (3) (4) (5)
ln_vc -0.517 -0.388 -0.354 -0.404 -0.259
(0.550) (0.529) (0.522) (0.608) (0.712)
ipr_c -0.029 0.222 0.286 0.354 0.688
(1.041) (0.988) (1.032) (1.189) (0.980)
ln_vc_x_ipr 0.071 -0.087 -0.119 -0.088 -0.059
(0.701) (0.723) (0.754) (0.849) (0.828)
ln_gdp_pc -1.810 -1.844 -1.986 -3.793
(2.292) (2.269) (2.222) (2.862)
ln_trade 0.401 -0.118 0.953
(1.975) (2.050) (2.306)
ln_rd 1.191 0.718
(1.702) (2.089)
ln_tertiary -3.656
(2.236)
Observations 428 428 428 408 358
R2 0.005 0.008 0.008 0.009 0.024
Adjusted R2 -0.121 -0.121 -0.124 -0.133 -0.142
Note: p<0.1; p<0.05; p<0.01
Country and year fixed effects included in all models.
Standard errors clustered by country.
  • p<0.1; ** p<0.05; *** p<0.01
% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com % Date and time: Sun, Feb 15, 2026 - 12:33:21
Show code
# quantity models

market_quantity <- run_twfe_incremental(
  depvar   = "ln_patents",
  fd       = "ln_market",
  fd_x_ipr = "ln_market_x_ipr",
  outfile  = "tables/twfe_market_quantity.tex",
  title    = "Market Finance and Patent Quantity"
)
Market Finance and Patent Quantity
Dependent Variable: ln(Patents per 1,000 Researchers)
ln_patents
(1) (2) (3) (4) (5)
ln_market 0.077 -0.023 -0.019 -0.027 0.124
(0.079) (0.102) (0.100) (0.095) (0.157)
ipr_c -0.073 -0.207 -0.205 -0.246 -0.383
(0.246) (0.266) (0.263) (0.260) (0.356)
ln_market_x_ipr 0.060 0.065 0.064 0.076 0.086
(0.081) (0.077) (0.075) (0.066) (0.103)
ln_gdp_pc 0.968 0.955 0.989* 0.759
(0.635) (0.607) (0.601) (0.576)
ln_trade -0.039 -0.028 -0.298
(0.313) (0.320) (0.408)
ln_rd -0.240 -0.798
(0.649) (0.524)
ln_tertiary 0.759***
(0.258)
Observations 756 754 754 754 533
R2 0.017 0.070 0.070 0.072 0.243
Adjusted R2 -0.088 -0.030 -0.031 -0.031 0.121
Note: p<0.1; p<0.05; p<0.01
Country and year fixed effects included in all models.
Standard errors clustered by country.
  • p<0.1; ** p<0.05; *** p<0.01
% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com % Date and time: Sun, Feb 15, 2026 - 12:33:23
Show code
bank_quantity <- run_twfe_incremental(
  depvar   = "ln_patents",
  fd       = "ln_bank",
  fd_x_ipr = "ln_bank_x_ipr",
  outfile  = "tables/twfe_bank_quantity.tex",
  title    = "Bank Finance and Patent Quantity"
)
Bank Finance and Patent Quantity
Dependent Variable: ln(Patents per 1,000 Researchers)
ln_patents
(1) (2) (3) (4) (5)
ln_bank 0.085 0.021 0.022 -0.004 0.186
(0.142) (0.166) (0.166) (0.161) (0.210)
ipr_c -0.611 -0.426 -0.468 -0.629 -1.427*
(0.921) (0.661) (0.635) (0.652) (0.801)
ln_bank_x_ipr 0.286 0.156 0.166 0.206 0.379**
(0.287) (0.165) (0.158) (0.158) (0.186)
ln_gdp_pc 1.276** 1.289** 1.379** 1.147**
(0.620) (0.619) (0.624) (0.503)
ln_trade 0.108 0.149 0.208
(0.286) (0.283) (0.291)
ln_rd -0.462 -0.631
(0.552) (0.442)
ln_tertiary 0.514*
(0.269)
Observations 813 801 801 800 614
R2 0.125 0.219 0.220 0.228 0.370
Adjusted R2 0.036 0.138 0.138 0.146 0.279
Note: p<0.1; p<0.05; p<0.01
Country and year fixed effects included in all models.
Standard errors clustered by country.
  • p<0.1; ** p<0.05; *** p<0.01
% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com % Date and time: Sun, Feb 15, 2026 - 12:33:25
Show code
vc_quantity <- run_twfe_incremental(
  depvar   = "ln_patents",
  fd       = "ln_vc",
  fd_x_ipr = "ln_vc_x_ipr",
  outfile  = "tables/twfe_vc_quantity.tex",
  title    = "Venture Capital and Patent Quantity"
)
Venture Capital and Patent Quantity
Dependent Variable: ln(Patents per 1,000 Researchers)
ln_patents
(1) (2) (3) (4) (5)
ln_vc -0.215** -0.222** -0.193** -0.126 -0.002
(0.102) (0.091) (0.089) (0.084) (0.073)
ipr_c -0.338 -0.359* -0.322 -0.343* -0.218
(0.250) (0.214) (0.204) (0.183) (0.156)
ln_vc_x_ipr 0.155 0.163 0.136 0.072 -0.015
(0.111) (0.111) (0.117) (0.123) (0.111)
ln_gdp_pc 0.142 0.118 0.459* 0.775***
(0.401) (0.401) (0.278) (0.230)
ln_trade 0.351 0.502 0.220
(0.415) (0.408) (0.326)
ln_rd -0.962* -0.719*
(0.498) (0.389)
ln_tertiary 0.225
(0.270)
Observations 375 375 375 375 329
R2 0.039 0.040 0.051 0.122 0.082
Adjusted R2 -0.099 -0.101 -0.093 -0.014 -0.083
Note: p<0.1; p<0.05; p<0.01
Country and year fixed effects included in all models.
Standard errors clustered by country.
  • p<0.1; ** p<0.05; *** p<0.01
% Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com % Date and time: Sun, Feb 15, 2026 - 12:33:27

4 zinb models

Show code
# ZINB MODELS

library(data.table)
library(pscl)
library(modelsummary)
library(dplyr)
library(knitr)

df <- fread("data/monarch_panel_transformed.csv")

# center fd variables
df[, ln_bank_c   := ln_bank   - mean(ln_bank,   na.rm = TRUE)]
df[, ln_market_c := ln_market - mean(ln_market, na.rm = TRUE)]
df[, ln_vc_c     := ln_vc     - mean(ln_vc,     na.rm = TRUE)]

df[, ln_bank_x_ipr_c   := ln_bank_c   * ipr_c]
df[, ln_market_x_ipr_c := ln_market_c * ipr_c]
df[, ln_vc_x_ipr_c     := ln_vc_c     * ipr_c]

df[, ln_internal_patents := log(patent_count)]

if (!dir.exists("tables")) {
  dir.create("tables")
}

# zinb with incremental controls
run_zinb_incremental <- function(finance_var, latex_name, title_name) {

  fd_c  <- paste0(finance_var, "_c")
  fd_x  <- paste0(finance_var, "_x_ipr_c")

  controls <- c("ln_gdp_pc", "ln_trade", "ln_rd", "ln_tertiary")

  models <- list()

  for (i in 1:4) {

    intensity_controls <- paste(controls[1:i], collapse = " + ")

    formula_str <- paste0(
      "breakthrough_count ~ ",
      fd_c, " + ipr_c + ", fd_x,
      " + ", intensity_controls,
      " + offset(ln_internal_patents) | ",
      "ln_gdp_pc"
    )

    model <- zeroinfl(
      as.formula(formula_str),
      data = df,
      dist = "negbin"
    )

    models[[i]] <- model
  }

# save latex tables
  modelsummary(
    models,
    output = latex_name,
    stars = TRUE,
    statistic = "({std.error})",
    coef_omit = "Intercept",
    gof_omit = "IC|Log|Adj|Pseudo",
    title = title_name,
    notes = c(
      "Zero-Inflated Negative Binomial model.",
      "Count equation models breakthrough intensity conditional on entry.",
      "Selection equation models structural non-entry.",
      "Selection equation includes ln_gdp_pc only.",
      "Offset = log(total patents).",
      "Standard errors in parentheses."
    )
  )

# save html

  cat("\n\n")
cat("##", title_name, "\n\n")

for (j in 1:4) {

  cat("### Model", j, "\n\n")

  model_sum <- summary(models[[j]])

#intensity model
  count_coefs <- model_sum$coefficients$count

  count_table <- data.frame(
    Variable = rownames(count_coefs),
    Coefficient = round(count_coefs[,1], 3),
    Std_Error = round(count_coefs[,2], 3)
  )

  cat("#### Intensity (Negative Binomial Count Equation)\n\n")
  print(knitr::kable(count_table))
  cat("\n\n")

# selection model
  zero_coefs <- model_sum$coefficients$zero

  zero_table <- data.frame(
    Variable = rownames(zero_coefs),
    Coefficient = round(zero_coefs[,1], 3),
    Std_Error = round(zero_coefs[,2], 3)
  )

  cat("#### Selection (Logit Structural Zero Equation)\n\n")
  print(knitr::kable(zero_table))
  cat("\n\n")

  cat("Observations:", models[[j]]$n, "\n\n")
}

  invisible(models)
}

# run mkt
zinb_market <- run_zinb_incremental(
  "ln_market",
  "tables/zinb_market.tex",
  "ZINB: Market Finance and Breakthrough Innovation"
)

4.1 ZINB: Market Finance and Breakthrough Innovation

4.1.1 Model 1

4.1.1.1 Intensity (Negative Binomial Count Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) -9.415 0.761
ln_market_c ln_market_c -0.064 0.051
ipr_c ipr_c -0.457 0.062
ln_market_x_ipr_c ln_market_x_ipr_c 0.215 0.061
ln_gdp_pc ln_gdp_pc 0.372 0.074
Log(theta) Log(theta) 0.491 0.079

4.1.1.2 Selection (Logit Structural Zero Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) 12.733 5.280
ln_gdp_pc ln_gdp_pc -1.820 0.709

Observations: 1450

4.1.2 Model 2

4.1.2.1 Intensity (Negative Binomial Count Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) -9.275 0.732
ln_market_c ln_market_c -0.014 0.052
ipr_c ipr_c -0.523 0.062
ln_market_x_ipr_c ln_market_x_ipr_c 0.160 0.062
ln_gdp_pc ln_gdp_pc 0.485 0.074
ln_trade ln_trade -0.310 0.051
Log(theta) Log(theta) 0.616 0.085

4.1.2.2 Selection (Logit Structural Zero Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) 12.296 7.049
ln_gdp_pc ln_gdp_pc -1.819 0.972

Observations: 1450

4.1.3 Model 3

4.1.3.1 Intensity (Negative Binomial Count Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) -9.801 0.802
ln_market_c ln_market_c 0.206 0.166
ipr_c ipr_c -0.539 0.148
ln_market_x_ipr_c ln_market_x_ipr_c -0.024 0.176
ln_gdp_pc ln_gdp_pc 0.399 0.090
ln_trade ln_trade -0.178 0.077
ln_rd ln_rd 0.756 0.160
Log(theta) Log(theta) 0.750 0.106

4.1.3.2 Selection (Logit Structural Zero Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) 15.356 19.037
ln_gdp_pc ln_gdp_pc -2.334 2.734

Observations: 890

4.1.4 Model 4

4.1.4.1 Intensity (Negative Binomial Count Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) -9.515 0.988
ln_market_c ln_market_c 0.085 0.201
ipr_c ipr_c -0.611 0.182
ln_market_x_ipr_c ln_market_x_ipr_c 0.098 0.218
ln_gdp_pc ln_gdp_pc 0.196 0.106
ln_trade ln_trade 0.051 0.111
ln_rd ln_rd 0.836 0.180
ln_tertiary ln_tertiary 0.175 0.161
Log(theta) Log(theta) 0.782 0.131

4.1.4.2 Selection (Logit Structural Zero Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) 15.713 14.055
ln_gdp_pc ln_gdp_pc -2.307 1.993

Observations: 641

Show code
# run bank
zinb_bank <- run_zinb_incremental(
  "ln_bank",
  "tables/zinb_bank.tex",
  "ZINB: Bank Finance and Breakthrough Innovation"
)

4.2 ZINB: Bank Finance and Breakthrough Innovation

4.2.1 Model 1

4.2.1.1 Intensity (Negative Binomial Count Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) -9.453 0.761
ln_bank_c ln_bank_c 0.207 0.097
ipr_c ipr_c -0.509 0.071
ln_bank_x_ipr_c ln_bank_x_ipr_c 0.143 0.096
ln_gdp_pc ln_gdp_pc 0.373 0.074
Log(theta) Log(theta) 0.590 0.089

4.2.1.2 Selection (Logit Structural Zero Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) 11.465 4.165
ln_gdp_pc ln_gdp_pc -1.629 0.554

Observations: 1337

4.2.2 Model 2

4.2.2.1 Intensity (Negative Binomial Count Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) -9.553 0.721
ln_bank_c ln_bank_c 0.156 0.098
ipr_c ipr_c -0.491 0.070
ln_bank_x_ipr_c ln_bank_x_ipr_c -0.023 0.102
ln_gdp_pc ln_gdp_pc 0.528 0.075
ln_trade ln_trade -0.352 0.056
Log(theta) Log(theta) 0.716 0.094

4.2.2.2 Selection (Logit Structural Zero Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) 10.419 6.481
ln_gdp_pc ln_gdp_pc -1.587 0.898

Observations: 1331

4.2.3 Model 3

4.2.3.1 Intensity (Negative Binomial Count Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) -9.948 0.689
ln_bank_c ln_bank_c -0.030 0.190
ipr_c ipr_c -0.636 0.136
ln_bank_x_ipr_c ln_bank_x_ipr_c 0.456 0.193
ln_gdp_pc ln_gdp_pc 0.355 0.084
ln_trade ln_trade -0.044 0.084
ln_rd ln_rd 0.722 0.155
Log(theta) Log(theta) 0.769 0.106

4.2.3.2 Selection (Logit Structural Zero Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) 832.493 1195.555
ln_gdp_pc ln_gdp_pc -124.179 178.258

Observations: 929

4.2.4 Model 4

4.2.4.1 Intensity (Negative Binomial Count Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) -8.927 0.973
ln_bank_c ln_bank_c -0.205 0.217
ipr_c ipr_c -0.597 0.161
ln_bank_x_ipr_c ln_bank_x_ipr_c 0.686 0.226
ln_gdp_pc ln_gdp_pc 0.069 0.105
ln_trade ln_trade 0.254 0.112
ln_rd ln_rd 0.961 0.173
ln_tertiary ln_tertiary 0.059 0.158
Log(theta) Log(theta) 0.795 0.121

4.2.4.2 Selection (Logit Structural Zero Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) 17.435 10.774
ln_gdp_pc ln_gdp_pc -2.464 1.526

Observations: 728

Show code
# run vc
zinb_vc <- run_zinb_incremental(
  "ln_vc",
  "tables/zinb_vc.tex",
  "ZINB: Venture Capital and Breakthrough Innovation"
)

4.3 ZINB: Venture Capital and Breakthrough Innovation

4.3.1 Model 1

4.3.1.1 Intensity (Negative Binomial Count Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) -8.201 1.618
ln_vc_c ln_vc_c -0.049 0.145
ipr_c ipr_c -0.887 0.192
ln_vc_x_ipr_c ln_vc_x_ipr_c 0.434 0.146
ln_gdp_pc ln_gdp_pc 0.283 0.148
Log(theta) Log(theta) 1.015 0.136

4.3.1.2 Selection (Logit Structural Zero Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) 39.580 9.254
ln_gdp_pc ln_gdp_pc -4.166 0.940

Observations: 428

4.3.2 Model 2

4.3.2.1 Intensity (Negative Binomial Count Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) -8.403 1.631
ln_vc_c ln_vc_c -0.042 0.145
ipr_c ipr_c -0.932 0.198
ln_vc_x_ipr_c ln_vc_x_ipr_c 0.405 0.150
ln_gdp_pc ln_gdp_pc 0.342 0.162
ln_trade ln_trade -0.087 0.100
Log(theta) Log(theta) 1.021 0.137

4.3.2.2 Selection (Logit Structural Zero Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) 38.812 9.304
ln_gdp_pc ln_gdp_pc -4.093 0.944

Observations: 428

4.3.3 Model 3

4.3.3.1 Intensity (Negative Binomial Count Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) -10.171 1.986
ln_vc_c ln_vc_c -0.154 0.164
ipr_c ipr_c -0.901 0.228
ln_vc_x_ipr_c ln_vc_x_ipr_c 0.469 0.162
ln_gdp_pc ln_gdp_pc 0.428 0.190
ln_trade ln_trade -0.040 0.106
ln_rd ln_rd 0.507 0.230
Log(theta) Log(theta) 1.015 0.139

4.3.3.2 Selection (Logit Structural Zero Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) 36.476 11.43
ln_gdp_pc ln_gdp_pc -3.914 1.16

Observations: 408

4.3.4 Model 4

4.3.4.1 Intensity (Negative Binomial Count Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) -8.844 2.939
ln_vc_c ln_vc_c -0.024 0.178
ipr_c ipr_c -0.636 0.256
ln_vc_x_ipr_c ln_vc_x_ipr_c 0.358 0.186
ln_gdp_pc ln_gdp_pc 0.202 0.233
ln_trade ln_trade 0.083 0.128
ln_rd ln_rd 0.483 0.253
ln_tertiary ln_tertiary 0.056 0.262
Log(theta) Log(theta) 0.994 0.146

4.3.4.2 Selection (Logit Structural Zero Equation)

Variable Coefficient Std_Error
(Intercept) (Intercept) 52.394 18.440
ln_gdp_pc ln_gdp_pc -5.502 1.936

Observations: 358

5 data diagnostics and graphs

5.1 breakthrough

Show code
library(ggplot2)
library(dplyr)

# breakthrough count diagnostics (justifies model choice)

df <- fread("data/monarch_panel_transformed.csv")

cat("Total observations:", nrow(df), "\n")
Total observations: 2783 
Show code
cat("Zero breakthrough observations:", sum(df$breakthrough_count == 0, na.rm = TRUE), "\n")
Zero breakthrough observations: 1933 
Show code
cat("Percent zeros:", round(mean(df$breakthrough_count == 0, na.rm = TRUE) * 100, 2), "%\n")
Percent zeros: 69.46 %
Show code
cat("Mean breakthrough count:", mean(df$breakthrough_count, na.rm = TRUE), "\n")
Mean breakthrough count: 4.850521 
Show code
cat("Variance breakthrough count:", var(df$breakthrough_count, na.rm = TRUE), "\n")
Variance breakthrough count: 439.1753 
Show code
ggplot(df, aes(x = breakthrough_count)) +
  geom_histogram(binwidth = 1, fill = "steelblue1", color = "white") +
  coord_cartesian(xlim = c(0, 30)) +
  labs(
    title = "Distribution of Breakthrough Counts",
    x = "Breakthrough Count (Top 1% Patents)",
    y = "Frequency"
  ) +
  theme_minimal()

5.2 patent quality distribution

Show code
ggplot(df, aes(x = quality_index_100)) +
  geom_density(fill = "deeppink1", alpha = 0.4) +
  labs(
    title = "Distribution of Patent Quality Index",
    x = "Patent Quality (0-100)",
    y = "Density"
  ) +
  theme_minimal()

5.3 distibution of FD variables

Show code
library(dplyr)
library(tidyverse)
vars <- c("ln_market", "ln_bank", "ln_vc")

df_long <- df %>%
  select(all_of(vars)) %>%
  pivot_longer(everything())

ggplot(df_long, aes(x = value)) +
  geom_density(fill = "deeppink1", alpha = 0.4) +
  facet_wrap(~name, scales = "free") +
  theme_minimal() +
  labs(title = "Distribution of Financial Development Measures")

5.4 corr plot

Show code
library(corrplot)

corr_vars <- df %>%
  select(quality_index_100, ln_patents,
         ln_market, ln_bank, ln_vc,
         ipr_c, ln_gdp_pc, ln_trade,
         ln_rd, ln_tertiary) %>%
  na.omit()

corr_matrix <- cor(corr_vars)

corrplot(corr_matrix, method = "color", type = "upper",
         tl.cex = 0.8, number.cex = 0.7)

5.5 interaction plots (marginal effects)

Show code
library(plm)

twfe_quality <- plm(
  quality_index_100 ~ ln_market * ipr_c +
    ln_bank + ln_vc,
  data = df,
  index = c("iso3c", "year"),
  model = "within",
  effect = "twoways"
)

twfe_quantity <- plm(
  ln_patents ~ ln_market * ipr_c +
    ln_bank + ln_vc,
  data = df,
  index = c("iso3c", "year"),
  model = "within",
  effect = "twoways"
)
library(marginaleffects)

ipr_sd <- sd(df$ipr_c, na.rm = TRUE)

ln_market_seq <- seq(
  min(df$ln_market, na.rm = TRUE),
  max(df$ln_market, na.rm = TRUE),
  length.out = 100
)

ipr_values <- c(-ipr_sd, 0, ipr_sd)

# Quality
plot_predictions(
  twfe_quality,
  condition = list(
    ln_market = ln_market_seq,
    ipr_c = ipr_values
  )
) +
  labs(
    title = "Market Finance × IPR Interaction (TWFE)",
    x = "Market Finance (log)",
    y = "Predicted Patent Quality"
  ) +
  theme_minimal()

Show code
# Quantity
plot_predictions(
  twfe_quantity,
  condition = list(
    ln_market = ln_market_seq,
    ipr_c = ipr_values
  )
) +
  labs(
    title = "Market Finance × IPR Interaction (TWFE)",
    x = "Market Finance (log)",
    y = "Predicted Log Patents"
  ) +
  theme_minimal()

5.6 more graphs

Show code
# distibution of patent quality
library(tidyverse)
library(data.table)

df <- fread("data/monarch_panel_transformed.csv")

# Create bins
df_plot <- df %>%
  filter(!is.na(ln_market), !is.na(ipr_c), !is.na(quality_index_100)) %>%
  mutate(
    market_bin = ntile(ln_market, 3),
    ipr_regime = ifelse(ipr_c <= median(ipr_c, na.rm = TRUE),
                        "Weak IPR", "Strong IPR"),
    group = paste0("Market Q", market_bin, " | ", ipr_regime)
  )

ggplot(df_plot, aes(x = quality_index_100, fill = ipr_regime)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~ market_bin, nrow = 1,
             labeller = labeller(market_bin = function(x) paste("Market Q", x))) +
  labs(
    title = "Innovation Quality Distributions by Market Finance and IPR",
    subtitle = "",
    x = "Patent Quality Index",
    y = "Density",
    fill = "IPR Regime"
  ) +
  theme_minimal()

Show code
# quality and quantity relationship
ggplot(df, aes(x = ln_patents, y = quality_index_100)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method = "loess", se = FALSE, color= 'deeppink2') +
  labs(
    title = "Average Patent Quality and Patent Quantity",
    subtitle = "",
    x = "Patent Quantity (log)",
    y = "Average Patent Quality"
  ) +
  theme_minimal()

Show code
library(tidyverse)
# scatterplots
plot_df <- df %>%
  select(
    iso3c, year,
    quality_index_100,
    ln_patents,
    ipr_c,
    ln_bank, ln_market, ln_vc
  ) %>%
  pivot_longer(
    cols = c(ln_bank, ln_market, ln_vc),
    names_to = "finance_type",
    values_to = "finance_depth"
  ) %>%
  mutate(
    finance_type = case_match(
      finance_type,
      "ln_bank"   ~ "Bank Finance",
      "ln_market" ~ "Market Finance",
      "ln_vc"     ~ "Venture Capital",
      .default = finance_type
    )
  )



# fd x quality
p1 <- ggplot(plot_df, aes(x = finance_depth, y = quality_index_100)) +
  geom_point(alpha = 0.25) +
  geom_smooth(method = "loess", se = FALSE, linewidth = 1, color = 'deeppink2') +
  facet_wrap(~ finance_type, scales = "free_x") +
  labs(
    title = "Financial Development and Patent Quality",
    subtitle = "",
    x = "Financial Depth (log)",
    y = "Patent Quality Index (0-100)"
  ) +
  theme_minimal()

# fd x quantity
p2 <- ggplot(plot_df, aes(x = finance_depth, y = ln_patents)) +
  geom_point(alpha = 0.25) +
  geom_smooth(method = "loess", se = FALSE, linewidth = 1, color = 'deeppink2') +
  facet_wrap(~ finance_type, scales = "free_x") +
  labs(
    title = "Financial Development and Patent Quantity",
    subtitle = "",
    x = "Financial Depth (log)",
    y = "Log Patents per 1,000 Researchers"
  ) +
  theme_minimal()

# ipr x quality
p3 <- ggplot(df, aes(x = ipr_c, y = quality_index_100)) +
  geom_point(alpha = 0.25) +
  geom_smooth(method = "loess", se = FALSE, linewidth = 1, color = 'deeppink2') +
  labs(
    title = "IPR Strength and Patent Quality",
    subtitle = "Stronger IPR regimes show a weak relationship with quality",
    x = "IPR Strength (Centered Park Index)",
    y = "Patent Quality Index (0-100)"
  ) +
  theme_minimal()

# ipr x quantity
p4 <- ggplot(df, aes(x = ipr_c, y = ln_patents)) +
  geom_point(alpha = 0.25) +
  geom_smooth(method = "loess", se = FALSE, linewidth = 1, color = 'deeppink2') +
  labs(
    title = "IPR Strength and Patent Quantity",
    subtitle = "Stronger IPR regimes file more patents on average",
    x = "IPR Strength (Centered Park Index)",
    y = "Log Patents per 1,000 Researchers"
  ) +
  theme_minimal()

print(p1)

Show code
print(p2)

Show code
print(p3)

Show code
print(p4)

Show code
quality_market_df <- df %>%
  filter(
    !is.na(quality_index_100),
    !is.na(ln_market),
    !is.na(ipr_c)
  )

quality_market_ipr_df <- quality_market_df %>%
  mutate(
    ipr_regime = ifelse(ipr_c >= 0, "Above-Average IPR", "Below-Average IPR")
  )

ggplot(quality_market_ipr_df,
       aes(x = ln_market, y = quality_index_100, color = ipr_regime)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Patent Quality, Market Finance, and IPR",
    x = "Log Market Capitalization / GDP",
    y = "Patent Quality Index (0-100)",
    color = "IP Regime"
  ) +
  theme_minimal()

6 robustness checks

  • quadratic models
  • lags
  • idk some other stuff

6.1 joint tests and nonlinearity tests

Show code
# robustness and model analysis

library(dplyr)
library(plm)
library(lmtest)
library(sandwich)
library(car)

# center fd variables
df <- fread("data/monarch_panel_transformed.csv")

df_ext <- df %>%
  mutate(
    ln_bank_c   = ln_bank   - mean(ln_bank,   na.rm = TRUE),
    ln_market_c = ln_market - mean(ln_market, na.rm = TRUE),
    ln_vc_c     = ln_vc     - mean(ln_vc,     na.rm = TRUE),
    
    ln_bank_x_ipr   = ln_bank_c * ipr_c,
    ln_market_x_ipr = ln_market_c * ipr_c,
    ln_vc_x_ipr     = ln_vc_c * ipr_c,

    ln_bank_c2   = ln_bank_c^2,
    ln_market_c2 = ln_market_c^2,
    ln_vc_c2     = ln_vc_c^2
  )

panel_ext <- pdata.frame(df_ext, index = c("iso3c", "year"))

# clustered se
vcov_clust <- function(x) vcovHC(x, method = "arellano", cluster = "group")

# run models, see where squared terms are justified

cat("\n testing for nonlinearity \n")

 testing for nonlinearity 
Show code
#  formulas to check squared term p-values

check_quad <- function(dv, base_var) {
  f <- as.formula(paste0(dv, " ~ ", base_var, "_c + ", base_var, "_c2 + ipr_c + ", 
                        base_var, "_x_ipr + ln_gdp_pc + ln_trade + ln_rd"))
  
  m <- plm(f, data = panel_ext, model = "within", effect = "twoways")
  
  cat("\nChecking Quadratic for:", dv, "vs", base_var, "\n")
  # square term included
  print(coeftest(m, vcov = vcov_clust(m))[2, ]) 
}

check_quad("quality_index_100", "ln_market")

Checking Quadratic for: quality_index_100 vs ln_market 
  Estimate Std. Error    t value   Pr(>|t|) 
 0.1372631  0.1490833  0.9207139  0.3574770 
Show code
check_quad("quality_index_100", "ln_bank")    # EXPECTED: significant

Checking Quadratic for: quality_index_100 vs ln_bank 
  Estimate Std. Error    t value   Pr(>|t|) 
1.51142138 0.63335568 2.38637059 0.01723636 
Show code
check_quad("quality_index_100", "ln_vc")

Checking Quadratic for: quality_index_100 vs ln_vc 
  Estimate Std. Error    t value   Pr(>|t|) 
 0.0624900  0.1570784  0.3978269  0.6909969 
Show code
check_quad("ln_patents",        "ln_market")

Checking Quadratic for: ln_patents vs ln_market 
   Estimate  Std. Error     t value    Pr(>|t|) 
-0.01886422  0.03126178 -0.60342743  0.54642649 
Show code
check_quad("ln_patents",        "ln_bank")

Checking Quadratic for: ln_patents vs ln_bank 
  Estimate Std. Error    t value   Pr(>|t|) 
0.02737307 0.07257386 0.37717529 0.70615433 
Show code
check_quad("ln_patents",        "ln_vc")        # EXPECTED: significant

Checking Quadratic for: ln_patents vs ln_vc 
  Estimate Std. Error    t value   Pr(>|t|) 
0.07698413 0.03183525 2.41820396 0.01615012 
Show code
# final models and joint tests
cat("\n joint significance tests \n")

 joint significance tests 
Show code
run_final <- function(dv, base_var, use_quad = FALSE) {
  
  main_vars <- if(use_quad) {
    paste0(base_var, "_c + ", base_var, "_c2")
  } else {
    paste0(base_var, "_c")
  }
  
  f <- as.formula(paste0(dv, " ~ ", main_vars, " + ipr_c + ", base_var, "_x_ipr + ln_gdp_pc + ln_trade + ln_rd"))
  m <- plm(f, data = panel_ext, model = "within", effect = "twoways")
  
  cat("\n", paste(rep("-", 40), collapse = ""), "\n")
  cat("FINAL MODEL:", dv, "by", base_var, "(Quadratic =", use_quad, ")\n")
  
  # joint signif
  hyp <- if(use_quad) {
    c(paste0(base_var, "_c = 0"), paste0(base_var, "_c2 = 0"), paste0(base_var, "_x_ipr = 0"))
  } else {
    c(paste0(base_var, "_c = 0"), paste0(base_var, "_x_ipr = 0"))
  }
  
  cat("\n[Joint Significance Test]\n")
  print(linearHypothesis(m, hyp, vcov = vcov_clust))
  
  # within fit
  cat("\n[Within R-squared]:", summary(m)$r.squared[1], "\n")
}

# quality final
run_final("quality_index_100", "ln_market", use_quad = FALSE)

 ---------------------------------------- 
FINAL MODEL: quality_index_100 by ln_market (Quadratic = FALSE )

[Joint Significance Test]

Linear hypothesis test:
ln_market_c = 0
ln_market_x_ipr = 0

Model 1: restricted model
Model 2: quality_index_100 ~ ln_market_c + ipr_c + ln_market_x_ipr + ln_gdp_pc + 
    ln_trade + ln_rd

Note: Coefficient covariance matrix supplied.

  Res.Df Df  Chisq Pr(>Chisq)   
1    804                        
2    802  2 11.486   0.003206 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

[Within R-squared]: 0.02452826 
Show code
run_final("quality_index_100", "ln_bank",   use_quad = TRUE)

 ---------------------------------------- 
FINAL MODEL: quality_index_100 by ln_bank (Quadratic = TRUE )

[Joint Significance Test]

Linear hypothesis test:
ln_bank_c = 0
ln_bank_c2 = 0
ln_bank_x_ipr = 0

Model 1: restricted model
Model 2: quality_index_100 ~ ln_bank_c + ln_bank_c2 + ipr_c + ln_bank_x_ipr + 
    ln_gdp_pc + ln_trade + ln_rd

Note: Coefficient covariance matrix supplied.

  Res.Df Df  Chisq Pr(>Chisq)  
1    842                       
2    839  3 10.585     0.0142 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

[Within R-squared]: 0.03938897 
Show code
run_final("quality_index_100", "ln_vc",     use_quad = FALSE)

 ---------------------------------------- 
FINAL MODEL: quality_index_100 by ln_vc (Quadratic = FALSE )

[Joint Significance Test]

Linear hypothesis test:
ln_vc_c = 0
ln_vc_x_ipr = 0

Model 1: restricted model
Model 2: quality_index_100 ~ ln_vc_c + ipr_c + ln_vc_x_ipr + ln_gdp_pc + 
    ln_trade + ln_rd

Note: Coefficient covariance matrix supplied.

  Res.Df Df  Chisq Pr(>Chisq)
1    358                     
2    356  2 2.0289     0.3626

[Within R-squared]: 0.008877963 
Show code
# quantity final
run_final("ln_patents",        "ln_market", use_quad = FALSE)

 ---------------------------------------- 
FINAL MODEL: ln_patents by ln_market (Quadratic = FALSE )

[Joint Significance Test]

Linear hypothesis test:
ln_market_c = 0
ln_market_x_ipr = 0

Model 1: restricted model
Model 2: ln_patents ~ ln_market_c + ipr_c + ln_market_x_ipr + ln_gdp_pc + 
    ln_trade + ln_rd

Note: Coefficient covariance matrix supplied.

  Res.Df Df  Chisq Pr(>Chisq)
1    680                     
2    678  2 1.3412     0.5114

[Within R-squared]: 0.07196042 
Show code
run_final("ln_patents",        "ln_bank",   use_quad = FALSE)

 ---------------------------------------- 
FINAL MODEL: ln_patents by ln_bank (Quadratic = FALSE )

[Joint Significance Test]

Linear hypothesis test:
ln_bank_c = 0
ln_bank_x_ipr = 0

Model 1: restricted model
Model 2: ln_patents ~ ln_bank_c + ipr_c + ln_bank_x_ipr + ln_gdp_pc + 
    ln_trade + ln_rd

Note: Coefficient covariance matrix supplied.

  Res.Df Df  Chisq Pr(>Chisq)
1    724                     
2    722  2 2.4426     0.2949

[Within R-squared]: 0.2278577 
Show code
run_final("ln_patents",        "ln_vc",     use_quad = TRUE)

 ---------------------------------------- 
FINAL MODEL: ln_patents by ln_vc (Quadratic = TRUE )

[Joint Significance Test]

Linear hypothesis test:
ln_vc_c = 0
ln_vc_c2 = 0
ln_vc_x_ipr = 0

Model 1: restricted model
Model 2: ln_patents ~ ln_vc_c + ln_vc_c2 + ipr_c + ln_vc_x_ipr + ln_gdp_pc + 
    ln_trade + ln_rd

Note: Coefficient covariance matrix supplied.

  Res.Df Df  Chisq Pr(>Chisq)  
1    326                       
2    323  3 7.7783    0.05082 .
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

[Within R-squared]: 0.1477852 

7 Variable Glossary

Identifiers

  • iso3c - Three-letter ISO country code. Primary country identifier used to merge OECD PQI, financial development, institutional, and macroeconomic datasets.
  • year - Observation year (country-year panel structure).
  • Country Name - Human-readable country name (not used in estimation; included for clarity).

Patent Quality Indicators (OECD PQI)

All quality indicators are aggregated to the country-year level using the mean across patents.

  • avg_patent_scope_norm - Average normalized patent scope; number of jurisdictions in which a patent family is filed (proxy for economic value and international relevance).
  • avg_family_size_norm - Average normalized patent family size; number of related patent filings across offices (proxy for strategic and economic importance).
  • avg_grant_lag_norm - Average normalized grant lag; time between application and grant (proxy for complexity).
  • avg_bwd_cits_norm - Average normalized backward citations; references to prior patents (knowledge base intensity).
  • avg_npl_cits_norm - Average normalized non-patent literature citations; references to scientific publications (science linkage).
  • avg_claims_norm - Average normalized number of claims; legal scope of protection.
  • avg_fwd_cits5_norm - Average normalized forward citations within 5 years; short-run technological impact.
  • avg_fwd_cits7_norm - Average normalized forward citations within 7 years; medium-run technological impact.
  • avg_generality - Technological breadth index (0-1); dispersion of forward citations across technology classes.
  • avg_originality - Citation diversity index (0-1); dispersion of backward citations across technology classes.
  • avg_radicalness - Novelty index (0-1); degree to which cited prior art differs from focal patent classification.
  • avg_quality_index_4 - Composite patent quality index based on four core indicators.
  • avg_quality_index_6 - Composite patent quality index based on six indicators.
  • quality_index_100 - avg_quality_index_4 * 100 for interpretability (used in TWFE models).
  • avg_renewal - Average years a patent is renewed; proxy for economic value (where available).

Breakthrough Innovation Variables

  • breakthrough - (patent-level, in Python step) Binary indicator equal to 1 if the patent belongs to the top 1% most cited patents within cohort, 0 otherwise.
  • breakthrough_count - Country-year count of breakthrough patents (sum of top 1% patents).
  • patent_count - Total number of patents in OECD PQI data for that country–year.
  • pct_breakthrough - Share of patents in a country-year that qualify as breakthroughs: breakthrough_count / patent_count.
  • ln_internal_patents - Log of patent_count; used as an offset in count models.

Patent Quantity Measures

  • patent_applications_residents - Resident patent applications (WDI source).
  • researchers_per_million - Researchers per million inhabitants; used to normalize patent output.
  • population_total - Total population; used to compute total researchers.
  • researchers_total - Computed as: researchers_per_million * (population_total / 1,000,000).
  • patents_per_1000_researchers - Scaled patent intensity measure: patent_applications_residents / researchers_total * 1000.
  • ln_patents - Log transformation of patents per 1,000 researchers (+1 shift).

Financial Development Variables

  • bank_credit_gdp - Domestic credit to private sector (% of GDP). Proxy for banking sector depth.
  • market_cap_gdp - Market capitalization of listed domestic companies (% of GDP). Proxy for stock market depth.
  • VC_investment - Venture capital investment as % of GDP.
  • VC_investment_pct - VC_investment * 100, rescaled to match percentage units of other financial variables.
  • ln_bank - Log of bank_credit_gdp + 1.
  • ln_market - Log of market_cap_gdp + 1.
  • ln_vc - Log of VC_investment_pct + 1.
  • [fd_var]_x_ipr - Interaction term between financial development variable and institutional quality.
  • [fd_var]_c - Mean-centered financial variable (used only in ZINB models).
  • [fd_var]_x_ipr_c - Interaction between centered finance variable and centered IPR (ZINB only).

Institutional Variable

  • ipp_index_IP_transformed - Park Intellectual Property Protection Index (0-5 scale); five-year fill interpolation.
  • ipr_c - Mean-centered Park index. Used to improve interpretation of interaction coefficients.

Macroeconomic Controls

All controls are log-transformed (+1 shift where necessary).

  • gdp_pc_const2015usd - GDP per capita (constant 2015 USD).
  • ln_gdp_pc - Log GDP per capita.
  • trade_percent_gdp - Trade openness (% of GDP).
  • ln_trade - Log trade openness.
  • rd_expenditure_percent_gdp - R&D expenditure (% of GDP).
  • ln_rd - Log R&D expenditure.
  • tertiary_enrollment_percent - Gross tertiary enrollment (%).
  • ln_tertiary - Log tertiary enrollment.

Model-Specific Parameters

  • Log(theta) - Log of dispersion parameter in negative binomial models. Captures overdispersion relative to Poisson.